1 Setup

  • Libraries
suppressPackageStartupMessages({
  library(data.table)
  library(DESeq2)
  library(emoji)
  library(gplots)
  library(here)
  library(hyperSpec)
  library(parallel)
  library(pander)
  library(pheatmap)
  library(plotly)
  library(pvclust)
  library(RColorBrewer)
  library(scatterplot3d)
  library(tidyverse)
  library(tximport)
  library(vsn)
})
  • Helper functions
source(here("UPSCb-common/src/R/featureSelection.R"))
  • Graphics
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
  • Metadata Sample information
samples <- read_delim(file = here("doc/Samples-swap-corrected.tsv"), 
                      delim="\t",
                      col_types=cols(
                        SampleID=col_factor(),
                        Time=col_factor(),
                        Replicate=col_factor())
                      ) %>% 
  mutate(Time=relevel(Time,"std"))

2 Raw data

filelist <- list.files(here("data/salmon"), 
                          recursive = TRUE, 
                          pattern = "quant.sf",
                          full.names = TRUE)

Sort the raw data and samples

samples <- samples[match(basename(dirname(filelist)),samples$SampleID),]
stopifnot(all(samples$SampleID == basename(dirname(filelist))))

name the file list vector

names(filelist) <- samples$SampleID

Read the expression at the transcript level (we have no gene information)

counts <- suppressMessages(round(tximport(files = filelist, 
                                  type = "salmon",txOut = TRUE)$counts))

Read the algae IDs

IDs <- scan(here("data/analysis/annotation/algae-IDs.txt"),
            what = "character")

Subset the data

counts <- counts[IDs,]

2.1 Quality Control

  • Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "4.5% percent (2244) of 49477 genes are not expressed"
  • Let us take a look at the sequencing depth, colouring by Time
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>% 
  bind_cols(samples)
levels =  c("std", "60min", "4hrs", "12hrs", "24hrs", "72hrs", "120hrs")
ggplot(dat,aes(x,y,fill=factor(Time, levels))) +
  geom_col() + 
  scale_y_continuous(name="reads") + 
  scale_fill_manual(values = pal) + 
  theme(axis.text.x=element_text(angle=90,size=10),
        axis.title.x=element_blank()) +
  labs(fill = "Time")

  • Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) + 
  geom_density(na.rm = TRUE) + 
  ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")

The same is done for the individual samples colored by Time

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Time=samples$Time[match(ind,samples$SampleID)])
dat$Time <- factor(dat$Time, levels)

ggplot(dat,aes(x=values,group=ind,col=Time)) +
  geom_density(na.rm = TRUE) +
  ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)") + 
  scale_color_manual(values = pal[1:7])

2.2 Export

dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data.csv"))

3 Data normalisation

3.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.

dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples,
  design = ~ Time) %>%
  suppressMessages()

save(dds,file=here("data/analysis/salmon/dds-sample-swap-corrected.rda"))

Check the size factors (i.e. the sequencing library size effect)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
# pander(sizes)
boxplot(sizes, main="Sequencing libraries size factor",
        las=2,log="y")
abline(h=1, col = "Red", lty = 3)

3.2 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=FALSE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked adequately

  • before:
meanSdPlot(log1p(counts(dds))[rowSums(counts(dds))>0,])

  • after:
meanSdPlot(vst[rowSums(vst)>0,])

3.3 QC on the normalised data

3.3.1 PCA

pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
  • Cumulative components effect

We define the number of variable of the model

nvar=1

An the number of possible combinations

nlevel=nlevels(dds$Time)

We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.

ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
  geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",linewidth=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",linewidth=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

⭐ The PCA shows that a large fraction of the variance is explained by first 3 components.

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    PC3=pc$x[,3],
                    samples)

#PC1 vs PC2
p1 <- pc.dat %>% 
  ggplot(mapping = aes(x=PC1,y=PC2,text=gsub(replacement = "", x = SampleID, pattern = "_B2_2"))) + 
  geom_point(size = 2) +
  aes(color=Time) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

p1 %<>% ggplotly() %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep=""))) %>% suppressWarnings()

#PC1 vs PC3
p2 <- pc.dat %>% 
  ggplot(mapping = aes(x=PC1,y=PC3,text=gsub(replacement = "", x = SampleID, pattern = "_B2_2"))) + 
  geom_point(size = 2) +
  aes(color=Time) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

p2 %<>% ggplotly() %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC3 (",percent[3],"%)",sep=""))) %>% suppressWarnings()
subplot(style(p1, showlegend = F), p2,
        titleX = TRUE, titleY = TRUE, nrows = 1, margin = c(0.05, 0.05, 0, 0))

3.3.2 Heatmap

Filter for noise

conds <- factor(dds$Time)
sels <- rangeFeatureSelect(counts=vst,
                           conditions=dds$Time,
                           nrep=3)

vst.cutoff <- 2
  • Heatmap of “all” genes
nn <- nrow(vst[sels[[vst.cutoff+1]],])
tn <- nrow(vst)
pn <- round(nn * 100/ tn, digits=1)

⚠️ 24.4% (12071) of total 49477 genes are plotted below:

mat <- t(scale(t(vst[sels[[vst.cutoff+1]],])))

#annotation column
colnames(mat) <- gsub("B2_2_","",colnames(mat))
col_anno <- samples %>% select(Time) %>% as.data.frame()
col_anno$Time <- factor(col_anno$Time, levels)
rownames(col_anno) <- colnames(mat)

#annotation colors
col_anno_col = pal[1:7]
names(col_anno_col) <- levels
col_anno_col=list(Time = col_anno_col)

hm <- pheatmap(mat,
               color = hpal,
               border_color = NA,
               clustering_distance_cols = "correlation",
               clustering_distance_rows = "correlation",
               clustering_method = "ward.D2",
               annotation_col = col_anno,
               show_rownames = F,
               labels_col = conds,
               angle_col = 90,
               annotation_colors = col_anno_col,
               legend = F)

plot(as.hclust(hm$tree_col),xlab="",sub="")

hm.pvclust <- suppressMessages(
  pvclust(data = t(scale(t(vst[sels[[vst.cutoff]],]))),
          method.hclust = "ward.D2", 
          nboot = 100, parallel = TRUE, quiet = T)
)

plot(hm.pvclust, labels = conds)
pvrect(hm.pvclust)

bootstrapping results as a table
print(hm.pvclust, digits=3)
## 
## Cluster method: ward.D2
## Distance      : correlation
## 
## Estimates on edges:
## 
##       si    au    bp se.si se.au se.bp      v      c  pchi
## 1  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 2  0.917 0.948 0.987 0.170 0.120 0.009 -1.933 -0.305 0.796
## 3  0.956 0.976 0.987 0.072 0.045 0.006 -2.100 -0.122 0.885
## 4  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 5  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 6  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 7  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 8  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 9  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 10 0.810 0.911 0.885 0.084 0.050 0.011 -1.273  0.072 0.502
## 11 0.994 0.998 0.994 0.022 0.011 0.007 -2.651  0.163 0.349
## 12 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 13 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 14 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 15 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 16 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 17 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 18 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 19 0.854 0.898 0.990 0.266 0.210 0.007 -1.795 -0.525 0.988
## 20 0.772 0.836 0.984 0.253 0.209 0.007 -1.562 -0.586 0.981
## 21 0.772 0.836 0.984 0.253 0.209 0.007 -1.562 -0.586 0.981
## 22 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 23 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 24 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 25 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 26 0.860 0.919 0.964 0.112 0.078 0.008 -1.599 -0.202 0.827
## 27 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000

 

4 Summary

⭐ The data looks very good after fixing the earlier samples swap. Both PCA and heatmap show separation between different times except for 72 and 120 hours which are mixed together. In PCA plots, the first component explains the majority of variation (27%) and separates samples at 1,4 and 12 hours from the rest (std, 24,72 and 120 hrs). The second component explains further variation (16%) separating early stages (std, 1 and 4 hours) from later stages. Together the PCA and Heatmap plots suggest that 4 and 12hrs conditions have the strongest effect. This is less in 60 min but still there is a clear effect. While 72hr and 120hr clustered separately from other times, they do not show much difference between themselves.

5 Session Info

Session Info
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] parallel  grid      stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vsn_3.66.0                  tximport_1.26.1            
##  [3] lubridate_1.9.2             forcats_1.0.0              
##  [5] stringr_1.5.0               dplyr_1.1.1                
##  [7] purrr_1.0.1                 readr_2.1.4                
##  [9] tidyr_1.3.0                 tibble_3.2.1               
## [11] tidyverse_2.0.0             scatterplot3d_0.3-43       
## [13] RColorBrewer_1.1-3          pvclust_2.2-0              
## [15] plotly_4.10.1               pheatmap_1.0.12            
## [17] pander_0.6.5                hyperSpec_0.100.0          
## [19] xml2_1.3.3                  ggplot2_3.4.2              
## [21] lattice_0.20-45             here_1.0.1                 
## [23] gplots_3.1.3                emoji_15.0                 
## [25] DESeq2_1.38.3               SummarizedExperiment_1.28.0
## [27] Biobase_2.58.0              MatrixGenerics_1.10.0      
## [29] matrixStats_0.63.0          GenomicRanges_1.50.2       
## [31] GenomeInfoDb_1.34.9         IRanges_2.32.0             
## [33] S4Vectors_0.36.2            BiocGenerics_0.44.0        
## [35] data.table_1.14.8          
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.1-0       deldir_1.0-6           ellipsis_0.3.2        
##  [4] rprojroot_2.0.3        XVector_0.38.0         rstudioapi_0.14       
##  [7] hexbin_1.28.3          farver_2.1.1           affyio_1.68.0         
## [10] bit64_4.0.5            AnnotationDbi_1.60.2   fansi_1.0.4           
## [13] codetools_0.2-19       cachem_1.0.7           geneplotter_1.76.0    
## [16] knitr_1.42             jsonlite_1.8.4         annotate_1.76.0       
## [19] png_0.1-8              BiocManager_1.30.20    compiler_4.2.0        
## [22] httr_1.4.5             Matrix_1.5-3           fastmap_1.1.1         
## [25] lazyeval_0.2.2         limma_3.54.2           cli_3.6.1             
## [28] htmltools_0.5.5        tools_4.2.0            gtable_0.3.3          
## [31] glue_1.6.2             GenomeInfoDbData_1.2.9 affy_1.76.0           
## [34] Rcpp_1.0.10            jquerylib_0.1.4        vctrs_0.6.1           
## [37] Biostrings_2.66.0      preprocessCore_1.60.2  crosstalk_1.2.0       
## [40] xfun_0.38              brio_1.1.3             testthat_3.1.7        
## [43] timechange_0.2.0       lifecycle_1.0.3        gtools_3.9.4          
## [46] XML_3.99-0.14          zlibbioc_1.44.0        scales_1.2.1          
## [49] vroom_1.6.1            hms_1.1.3              yaml_2.3.7            
## [52] memoise_2.0.1          sass_0.4.5             latticeExtra_0.6-30   
## [55] stringi_1.7.12         RSQLite_2.3.1          highr_0.10            
## [58] caTools_1.18.2         BiocParallel_1.32.6    rlang_1.1.0           
## [61] pkgconfig_2.0.3        bitops_1.0-7           evaluate_0.20         
## [64] labeling_0.4.2         htmlwidgets_1.6.2      bit_4.0.5             
## [67] tidyselect_1.2.0       magrittr_2.0.3         R6_2.5.1              
## [70] generics_0.1.3         DelayedArray_0.24.0    DBI_1.1.3             
## [73] pillar_1.9.0           withr_2.5.0            KEGGREST_1.38.0       
## [76] RCurl_1.98-1.12        crayon_1.5.2           interp_1.1-4          
## [79] KernSmooth_2.23-20     utf8_1.2.3             tzdb_0.3.0            
## [82] rmarkdown_2.21         jpeg_0.1-10            locfit_1.5-9.7        
## [85] blob_1.2.4             digest_0.6.31          xtable_1.8-4          
## [88] munsell_0.5.0          viridisLite_0.4.1      bslib_0.4.2

 

 

drawing

Created by Aman Zare

aman.zare@umu.se